This dataset is from Kaggle

Prepare for analysis

# Point the session at the project's script directory so that the relative
# path given to readRDS() below resolves; the path is machine-specific.
suppressMessages(suppressWarnings(setwd("~/../Desktop/Spring2019/MA681_RENEW_PROJECT/SCRIPT/")))
# Attach the packages used by the analysis, silencing their startup output.
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(corrplot)))
# Load the serialized crime data and show its first 10 rows.
tidied_data <- readRDS("crime_data.rds")
head(tidied_data,10)

Preprocessing for the analysis

# Add a human-readable district name to every record.
# `distric_name` is listed in the same order as the sorted district codes, so
# the n-th sorted code is translated to the n-th name.
distric_name <- c("DOWNTOWN AND CHARLESTOWN-1","DOWNTOWN AND CHARLESTOWN-2",
                  "EAST BOSTON","ROXBURY","MATTAPAN","DORCHESTER",
                  "SOUTH BOSTON","BRIGHTON","SOUTH END","JAMAICA PLAIN",
                  "HYDE PARK","WEST ROXBURY","no report")
distric_code <- sort(unique(as.character(tidied_data$DISTRICT)))
# Vectorised lookup by column name: match() gives the position of each
# record's code among the sorted codes. A missing code yields NA instead of a
# zero-length result, so the new column is always a plain character vector.
tidied_data$district_name <- distric_name[
  match(as.character(tidied_data$DISTRICT), distric_code)
]

# Derive abbreviated month and weekday columns as ordered factors.
# Months keep their first three letters (levels follow month.abb); weekdays
# keep three letters as well, except Thursday, which keeps four ("Thur").
tidied_data <- tidied_data %>%
  mutate(
    MONTH_abb = factor(substr(MONTH, 1, 3), levels = month.abb),
    WEEKDAY_abb = factor(
      # substr() is vectorised over `stop`, so the cut-off is chosen per row
      substr(DAY_OF_WEEK, 1, ifelse(DAY_OF_WEEK == "Thursday", 4, 3)),
      levels = c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun")
    )
  )

# Create the YEAR_MONTH and MONTH_DAY factors.
# For each of them the rows are sorted by the underlying variables first, so
# the order in which the pasted labels first appear is used as level order.
tidied_data <- arrange(tidied_data, YEAR, MONTH_abb)
factor_labels <- with(tidied_data, paste(YEAR, MONTH_abb, sep = "-"))
factor_levels <- factor_labels[!duplicated(factor_labels)]
tidied_data[["YEAR_MONTH"]] <- factor(factor_labels, levels = factor_levels)

# Same recipe for month/weekday; the rows stay sorted by MONTH_abb and
# WEEKDAY_abb afterwards.
tidied_data <- arrange(tidied_data, MONTH_abb, WEEKDAY_abb)
factor_labels <- with(tidied_data, paste(MONTH_abb, WEEKDAY_abb, sep = "-"))
factor_levels <- factor_labels[!duplicated(factor_labels)]
tidied_data[["MONTH_DAY"]] <- factor(factor_labels, levels = factor_levels)

# Preview the first 10 rows of a selection of columns (picked by position),
# including the variables added to the dataset above.
head(tidied_data[, c(2, 14, 10, 3, 11, 15:18)], n = 10)

Total number of crimes

# Line plot of the number of recorded crimes per day.
# The summary column is named `Occurrences` because ggplot uses the mapped
# column name as the y-axis title.
tidied_data %>%
  group_by(crime_date) %>%
  summarize(Occurrences = n()) %>%
  ggplot(aes(x = as.Date(crime_date), y = Occurrences, group = 1)) +
    geom_line() +
    # fixed tick positions on the date axis
    scale_x_date(breaks = as.Date(c("2015-06-15", "2016-04-12", "2017-02-09",
                                    "2017-12-06", "2018-10-03"))) +
    xlab("Crime date")

Number of crimes in different districts in different months. Different districts and time ranges can be selected.

# Plot the monthly number of crimes per district, one line per district.
#
# Arguments:
#   tidied_data       data frame with a YEAR_MONTH column (labels such as
#                     "2016-Jan") and a district_name column.
#   lower_year_month  first month to show.
#   upper_year_month  last month to show.
#   district_list     district names to include; defaults to every name in
#                     the global `distric_name`.
# Returns: a ggplot object.
# Raises:  an error when a bound matches no YEAR_MONTH value in the data.
#
# The lower and upper bound should be "2015-Jul" and "2018-Sep" at most,
# because the 2015 Jun and 2018 Oct data are not complete.
PlotDistrictsCrimesMonthly <- function(tidied_data, lower_year_month, upper_year_month, district_list = distric_name){
  # Months present in the data, in sorted (factor-level) order.
  year_months <- as.character(sort(unique(tidied_data$YEAR_MONTH)))

  # Position of a bound among the months: an exact label is preferred, a
  # regular-expression match is kept as a fallback for existing callers.
  locate <- function(label) {
    pos <- match(label, year_months)
    if (is.na(pos)) {
      pos <- grep(label, year_months)[1]
    }
    if (is.na(pos)) {
      stop("No YEAR_MONTH value matches '", label, "'", call. = FALSE)
    }
    pos
  }

  X_axis <- year_months[locate(lower_year_month):locate(upper_year_month)]
  # Tick labels at the quartile positions of the selected range; unique()
  # avoids repeated breaks when the range is shorter than five months.
  BREAKS <- unique(X_axis[floor(quantile(seq_along(X_axis), names = FALSE))])

  x <- tidied_data %>%
    filter(YEAR_MONTH %in% X_axis, district_name %in% district_list) %>%
    group_by(YEAR_MONTH, district_name) %>%
    summarize(Occurrences = n()) %>%
    ungroup() %>%
    na.omit()

  ggplot(x, aes(x = YEAR_MONTH, y = Occurrences,
                group = district_name, color = district_name)) +
    geom_line() +
    theme(legend.text = element_text(size = 7)) +
    scale_x_discrete(breaks = BREAKS, labels = BREAKS) +
    xlab("Time: Year-Month") +
    # paste0() keeps single spaces around the two bounds in the title
    ggtitle(paste0("Occurrences of crimes in different month from ",
                   lower_year_month, " to ", upper_year_month))
}

Number of crimes in different districts on different weekdays. The occurrences are summed, over the years in the dataset, for each weekday of each month.

# Plot the number of crimes per district for each month/weekday combination,
# one line per district.
#
# Arguments:
#   tidied_data      data frame with YEAR, MONTH_DAY (labels such as
#                    "Jan-Mon") and district_name columns.
#   Start_Month_Day  first month/weekday label to show.
#   End_Month_Day    last month/weekday label to show.
#   District_List    district names to include; defaults to every name in
#                    the global `distric_name`.
#   years            years whose records are counted. The default keeps the
#                    previously hard-coded 2016 and 2017 (presumably the only
#                    complete years in the data; confirm before changing).
# Returns: a ggplot object.
# Raises:  an error when a bound matches no MONTH_DAY value in the data.
PlotDistrictsCrimesDaily <- function(tidied_data, Start_Month_Day, End_Month_Day, District_List = distric_name, years = c(2016, 2017)){
  # Labels present in the data, in sorted (factor-level) order.
  month_days <- as.character(sort(unique(tidied_data$MONTH_DAY)))

  # Position of a bound among the labels: an exact label is preferred, a
  # regular-expression match is kept as a fallback for existing callers.
  locate <- function(label) {
    pos <- match(label, month_days)
    if (is.na(pos)) {
      pos <- grep(label, month_days)[1]
    }
    if (is.na(pos)) {
      stop("No MONTH_DAY value matches '", label, "'", call. = FALSE)
    }
    pos
  }

  X_axis <- month_days[locate(Start_Month_Day):locate(End_Month_Day)]
  # One tick label for every seventh position of the selected range.
  BREAKS <- X_axis[seq(1, length(X_axis), by = 7)]

  x <- tidied_data %>%
    filter(YEAR %in% years,
           MONTH_DAY %in% X_axis,
           district_name %in% District_List) %>%
    group_by(MONTH_DAY, district_name) %>%
    summarize(Occurrences = n()) %>%
    ungroup()

  ggplot(x, aes(x = MONTH_DAY, y = Occurrences,
                group = district_name, color = district_name)) +
    geom_line() +
    theme(axis.text.x = element_text(size = 7.5)) +
    scale_x_discrete(breaks = BREAKS, labels = BREAKS) +
    xlab("Time: Month-Day") +
    # paste0() keeps single spaces around the two bounds in the title
    ggtitle(paste0("Occurrences of crimes in different weekday from ",
                   Start_Month_Day, " to ", End_Month_Day))
}
# Monthly counts from 2016-Jan to 2017-Dec for five districts.
PlotDistrictsCrimesMonthly(
  tidied_data,
  lower_year_month = "2016-Jan",
  upper_year_month = "2017-Dec",
  district_list = c("SOUTH BOSTON", "SOUTH END", "WEST ROXBURY", "ROXBURY",
                    "EAST BOSTON")
)

# Monthly counts from 2015-Jul to 2018-Sep for four districts.
PlotDistrictsCrimesMonthly(
  tidied_data,
  lower_year_month = "2015-Jul",
  upper_year_month = "2018-Sep",
  district_list = c("DOWNTOWN AND CHARLESTOWN-1", "DOWNTOWN AND CHARLESTOWN-2",
                    "EAST BOSTON", "SOUTH END")
)

# Month/weekday counts from Jan-Mon to Apr-Sun for the same four districts.
PlotDistrictsCrimesDaily(
  tidied_data,
  Start_Month_Day = "Jan-Mon",
  End_Month_Day = "Apr-Sun",
  District_List = c("DOWNTOWN AND CHARLESTOWN-1", "DOWNTOWN AND CHARLESTOWN-2",
                    "EAST BOSTON", "SOUTH END")
)

Boxplot of occurrences in each month

# Daily crime counts for 2016 and 2017, labelled with the month of each day.
x <- tidied_data %>%
  filter(YEAR == 2016 | YEAR == 2017) %>%
  group_by(MONTH, crime_date) %>%
  summarize(Occurrences = n())

# Overall mean of the daily counts, drawn as a dashed reference line below.
day_mean <- mean(x$Occurrences)

# One box per month showing the spread of the daily counts.
ggplot(x, aes(x = MONTH, y = Occurrences)) +
  geom_boxplot(aes(fill = MONTH)) +
  geom_hline(yintercept = day_mean, linetype = 2, colour = "black", size = 1.25) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "Month", title = "Occurrences of crimes in each month")

Top crime types on Huntington Avenue.

# Bar chart of the most frequent crime types on one street.
#
# Arguments:
#   tidied_data  data frame with STREET and OFFENSE_CODE_GROUP columns.
#   street_name  street to report on, as spelled in STREET.
#   top          number of crime types to keep. Types tied with the last
#                kept one are kept too; if the street has fewer types than
#                `top`, all of them are shown.
# Returns: a ggplot object. A warning is raised when the street has no
#          records (the plot is then empty).
PlotOneStreetCrimes <- function(tidied_data, street_name, top = 10){
  stopifnot(is.numeric(top), length(top) == 1, top >= 1)

  # Restrict to the street first so only its records are counted.
  counts <- tidied_data %>%
    filter(STREET == street_name) %>%
    group_by(OFFENSE_CODE_GROUP) %>%
    summarize(Occurrences = n())

  if (nrow(counts) == 0) {
    warning("No crimes recorded for street '", street_name, "'", call. = FALSE)
  } else {
    # Cap the rank at the number of available types; indexing past the end
    # would give NA and discard every row.
    cutoff <- sort(counts$Occurrences, decreasing = TRUE)[min(top, nrow(counts))]
    counts <- counts[counts$Occurrences >= cutoff, ]
  }

  ggplot(counts, aes(OFFENSE_CODE_GROUP, Occurrences, fill = OFFENSE_CODE_GROUP)) +
    geom_bar(stat = "identity") +
    # the legend identifies the bars, so the x tick labels are hidden
    theme(axis.text.x = element_blank()) +
    xlab("Crime types") +
    ggtitle(paste0("Top ", top, " crime types in ", street_name))
}

Comparison of crime types across streets.

# Horizontal stacked bar chart comparing selected crime types across streets.
#
# Arguments:
#   tidied_data   data frame with STREET and OFFENSE_CODE_GROUP columns.
#   streets_list  streets to compare.
#   crimes_list   crime types (values of OFFENSE_CODE_GROUP) to count; all
#                 other types are left out of the plot.
# Returns: a ggplot object.
PlotMultipleStreetsCrimes <- function(tidied_data,streets_list,crimes_list){
  tidied_data %>%
    # keep only the requested streets and crime types before counting
    filter(STREET %in% streets_list, OFFENSE_CODE_GROUP %in% crimes_list) %>%
    # as.character() keeps the type labels even if the column is a factor
    mutate(Crime_types = as.character(OFFENSE_CODE_GROUP)) %>%
    group_by(STREET, Crime_types) %>%
    summarize(Occurrences = n()) %>%
    ungroup() %>%
    ggplot(aes(STREET, Occurrences, fill = Crime_types)) +
      geom_bar(stat = "identity") +
      coord_flip() +
      theme(legend.position = "right") +
      # the x aesthetic holds the streets (drawn vertically after the flip)
      xlab("Street") +
      ggtitle("Occurrences of crimes in streets")
}

# Crime types and streets used for the street comparison.
Crimes_types <- c(
  "Larceny",
  "Larceny From Motor Vehicle",
  "Simple Assault",
  "Aggravated Assault"
)
Streets_names <- c(
  "HUNTINGTON AVE",
  "BOYLSTON ST",
  "COLUMBUS AVE",
  "MASSACHUSETTS AVE",
  "NEWBURY ST"
)

# Top 10 crime types on two individual streets.
PlotOneStreetCrimes(tidied_data, street_name = "HUNTINGTON AVE", top = 10)

PlotOneStreetCrimes(tidied_data, street_name = "COMMONWEALTH AVE", top = 10)

# Selected crime types compared across the selected streets.
PlotMultipleStreetsCrimes(
  tidied_data,
  streets_list = Streets_names,
  crimes_list = Crimes_types
)

Correlation between different types of crimes

# Correlation between the daily counts of the different crime types.
# The counts are reshaped to one row per date and one column per crime type;
# a type with no record on a date counts as 0. The correlation matrix of the
# columns is then drawn (upper triangle, ellipses).
tidied_data %>%
  group_by(crime_date, OFFENSE_CODE_GROUP) %>%
  summarize(Counts = n()) %>%
  spread(key = OFFENSE_CODE_GROUP, value = Counts, fill = 0) %>%
  remove_rownames() %>%
  column_to_rownames(var = "crime_date") %>%
  as.matrix() %>%
  cor() %>%
  corrplot(
    type = "upper",
    method = "ellipse",
    tl.col = "black",
    tl.srt = 45,
    number.cex = .35,
    tl.cex = .25,
    outline = FALSE
  )

Crime locations plotted by longitude and latitude.

# Keep complete records only and drop those whose latitude or longitude is -1.
temp <- filter(na.omit(tidied_data), Lat != -1, Long != -1)

# Scatter plot of the crime locations, coloured by district. The points are
# nearly transparent, so the legend keys are overridden to be fully opaque.
ggplot(temp, aes(x = Long, y = Lat, color = district_name)) +
  geom_point(alpha = .1) +
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "black")
  )